library(car)
## Loading required package: carData
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(TSstudio)
library(ggplot2)
library(tibble)
library(caret)
## Loading required package: lattice
## Load packages
# Data Prep and EDA
library(knitr)
# install.packages("tidyverse")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
## x dplyr::recode() masks car::recode()
## x purrr::some() masks car::some()
library(corrplot)
## corrplot 0.92 loaded
# Logistic Reg. and Model Selection
library(caTools)
library(car)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-3
library(caret)
# # KNN
# library(kknn)
# # Decision Tree and Random Forest
# library(rpart)
# library(rpart.plot)
# library(randomForest)
## This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown, see http://rmarkdown.rstudio.com.
## When you click the Knit button, a document is generated that includes both the content and the output of any embedded R code chunks. You can embed an R code chunk like this:
# Load the pre-split training and test data sets
train3 <- read.csv("train3.csv")
test <- read.csv("test.csv")

# Drop the first column (exported row index) from both sets
train <- train3[-1]
test <- test[-1]

# Encode the categorical columns of the training set as factors
train$RFMSeg <- as.factor(train$RFMSeg)
train$Is_Buying_Most_Popular <- as.factor(train$Is_Buying_Most_Popular)
train$Country <- as.factor(train$Country)

# Mirror the encoding on the test set; column 12 is dropped between
# conversions, preserving the original preparation order (the position
# of column 12 depends on the earlier drop)
test$RFMSeg <- as.factor(test$RFMSeg)
test <- test[-12]
test$Is_Buying_Most_Popular <- as.factor(test$Is_Buying_Most_Popular)
test$Country <- as.factor(test$Country)

# Final summary of the prepared training data
summary(train)
## Orders_Unique Returns_Unique Total_Items_Purchased Quantity_Basket
## Min. : 0.000 Min. : 0.0000 Min. : 0.0 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 0.0000 1st Qu.: 146.0 1st Qu.: 91.29
## Median : 2.000 Median : 0.0000 Median : 326.0 Median : 158.00
## Mean : 3.083 Mean : 0.7027 Mean : 829.2 Mean : 230.20
## 3rd Qu.: 3.000 3rd Qu.: 1.0000 3rd Qu.: 728.5 3rd Qu.: 265.10
## Max. :51.000 Max. :15.0000 Max. :78758.0 Max. :4582.71
## Total_Items_Returned Types_Items_Purchased Unique_Item_Per_Basket
## Min. :-1296.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: -3.50 1st Qu.: 15.00 1st Qu.: 9.333
## Median : 0.00 Median : 32.00 Median : 17.000
## Mean : -13.32 Mean : 46.78 Mean : 21.498
## 3rd Qu.: 0.00 3rd Qu.: 61.00 3rd Qu.: 27.708
## Max. : 0.00 Max. :467.00 Max. :148.500
## Types_Items_Returned Unique_Item_Per_Return Sales_Revenue
## Min. : 0.000 Min. : 0.0000 Min. : 0.0
## 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 300.7
## Median : 0.000 Median : 0.0000 Median : 609.7
## Mean : 1.418 Mean : 0.8432 Mean : 1392.1
## 3rd Qu.: 1.000 3rd Qu.: 1.0000 3rd Qu.: 1281.8
## Max. :45.000 Max. :45.0000 Max. :110713.1
## Return_Refund Average_Unit_Price_Purchase Average_Unit_Refund_Return
## Min. :-3614.40 Min. : 0.000 Min. : 0.00
## 1st Qu.: -15.90 1st Qu.: 1.447 1st Qu.: 0.00
## Median : 0.00 Median : 1.863 Median : 0.00
## Mean : -38.21 Mean : 2.450 Mean : 3.59
## 3rd Qu.: 0.00 3rd Qu.: 2.433 3rd Qu.: 2.35
## Max. : 0.00 Max. :295.000 Max. :722.88
## Country Is_Buying_Most_Popular Recency
## Others : 121 0:856 Min. : 0.00
## United Kingdom:1258 1:523 1st Qu.: 14.00
## Median : 41.00
## Mean : 54.16
## 3rd Qu.: 82.00
## Max. :181.00
## Y_Income RFMSeg
## Min. : 1.0 0: 37
## 1st Qu.: 341.8 1:1070
## Median : 767.0 2: 272
## Mean : 1885.3
## 3rd Qu.: 1642.1
## Max. :168820.9
# Fit the full OLS model: Y_Income regressed on every other column of train
full.model <- lm(Y_Income~ ., data = train)
# Display coefficient estimates, R-squared, and the overall F-statistic
summary(full.model)
##
## Call:
## lm(formula = Y_Income ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20994.5 -519.1 -170.5 366.8 30363.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -325.55358 404.54171 -0.805 0.42111
## Orders_Unique -135.49509 28.03361 -4.833 1.50e-06 ***
## Returns_Unique -88.98896 77.15597 -1.153 0.24896
## Total_Items_Purchased -0.38652 0.09518 -4.061 5.17e-05 ***
## Quantity_Basket -0.20879 0.25732 -0.811 0.41727
## Total_Items_Returned -0.38686 1.15234 -0.336 0.73713
## Types_Items_Purchased -6.05021 2.28675 -2.646 0.00824 **
## Unique_Item_Per_Basket 3.93614 5.07074 0.776 0.43774
## Types_Items_Returned 2.78044 42.28039 0.066 0.94758
## Unique_Item_Per_Return -72.86254 52.38147 -1.391 0.16445
## Sales_Revenue 1.78890 0.06531 27.390 < 2e-16 ***
## Return_Refund -0.40498 0.59072 -0.686 0.49311
## Average_Unit_Price_Purchase -3.71719 6.17393 -0.602 0.54722
## Average_Unit_Refund_Return -4.39604 2.93526 -1.498 0.13445
## CountryUnited Kingdom 342.48439 204.18272 1.677 0.09371 .
## Is_Buying_Most_Popular1 -0.83823 126.44192 -0.007 0.99471
## Recency 0.53403 1.28981 0.414 0.67891
## RFMSeg1 497.31935 352.12084 1.412 0.15807
## RFMSeg2 519.49917 395.29515 1.314 0.18900
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2075 on 1360 degrees of freedom
## Multiple R-squared: 0.8955, Adjusted R-squared: 0.8941
## F-statistic: 647.2 on 18 and 1360 DF, p-value: < 2.2e-16
# Significant coefficients (p-value < 0.05); use the full name
# `$coefficients` rather than relying on partial matching via `$coeff`
which(summary(full.model)$coefficients[,4]<0.05)
## Orders_Unique Total_Items_Purchased Types_Items_Purchased
## 2 4 7
## Sales_Revenue
## 11
print("--------------------------------------------------------------------------------")
## [1] "--------------------------------------------------------------------------------"
## Insignificant variables
# Insignificant coefficients (p-value > 0.05)
which(summary(full.model)$coefficients[,4]>0.05)
## (Intercept) Returns_Unique
## 1 3
## Quantity_Basket Total_Items_Returned
## 5 6
## Unique_Item_Per_Basket Types_Items_Returned
## 8 9
## Unique_Item_Per_Return Return_Refund
## 10 12
## Average_Unit_Price_Purchase Average_Unit_Refund_Return
## 13 14
## CountryUnited Kingdom Is_Buying_Most_Popular1
## 15 16
## Recency RFMSeg1
## 17 18
## RFMSeg2
## 19
# Count slope coefficients: rows of the coefficient matrix minus the
# intercept (more robust than length()/4, which assumes 4 columns)
paste("number of coefficient:", nrow(summary(full.model)$coefficients) - 1)
## [1] "number of coefficient: 18"
# Refit the full model with a square-root transform of the response to
# reduce the strong right skew of Y_Income (consistent with the Box-Cox
# lambda of 0.5 found for the stepwise model below)
full.model.transformed <-lm(Y_Income^(1/2)~., data=train)
# Display summary
summary(full.model.transformed)
##
## Call:
## lm(formula = Y_Income^(1/2) ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -72.138 -8.590 -1.696 7.189 118.906
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.2106116 2.9525255 4.813 1.65e-06 ***
## Orders_Unique 1.5244188 0.2046018 7.451 1.64e-13 ***
## Returns_Unique -0.7471269 0.5631186 -1.327 0.18481
## Total_Items_Purchased -0.0058896 0.0006947 -8.478 < 2e-16 ***
## Quantity_Basket 0.0191830 0.0018781 10.214 < 2e-16 ***
## Total_Items_Returned -0.0362494 0.0084103 -4.310 1.75e-05 ***
## Types_Items_Purchased -0.0009672 0.0166897 -0.058 0.95380
## Unique_Item_Per_Basket 0.0252595 0.0370085 0.683 0.49502
## Types_Items_Returned 1.0104432 0.3085811 3.274 0.00109 **
## Unique_Item_Per_Return -1.6161617 0.3823033 -4.227 2.52e-05 ***
## Sales_Revenue 0.0070713 0.0004767 14.834 < 2e-16 ***
## Return_Refund 0.0053779 0.0043114 1.247 0.21247
## Average_Unit_Price_Purchase -0.0146259 0.0450601 -0.325 0.74554
## Average_Unit_Refund_Return 0.0179782 0.0214229 0.839 0.40150
## CountryUnited Kingdom -0.0875648 1.4902164 -0.059 0.95315
## Is_Buying_Most_Popular1 0.4159744 0.9228295 0.451 0.65223
## Recency -0.0282769 0.0094136 -3.004 0.00271 **
## RFMSeg1 5.8480868 2.5699347 2.276 0.02303 *
## RFMSeg2 9.7567188 2.8850400 3.382 0.00074 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.15 on 1360 degrees of freedom
## Multiple R-squared: 0.6997, Adjusted R-squared: 0.6957
## F-statistic: 176 on 18 and 1360 DF, p-value: < 2.2e-16
# Significant coefficients (p-value < 0.05); spell out `$coefficients`
# instead of relying on partial matching via `$coeff`
which(summary(full.model.transformed)$coefficients[,4]<0.05)
## (Intercept) Orders_Unique Total_Items_Purchased
## 1 2 4
## Quantity_Basket Total_Items_Returned Types_Items_Returned
## 5 6 9
## Unique_Item_Per_Return Sales_Revenue Recency
## 10 11 17
## RFMSeg1 RFMSeg2
## 18 19
print("--------------------------------------------------------------------------------")
## [1] "--------------------------------------------------------------------------------"
## Insignificant variables
# Insignificant coefficients (p-value > 0.05)
which(summary(full.model.transformed)$coefficients[,4]>0.05)
## Returns_Unique Types_Items_Purchased
## 3 7
## Unique_Item_Per_Basket Return_Refund
## 8 12
## Average_Unit_Price_Purchase Average_Unit_Refund_Return
## 13 14
## CountryUnited Kingdom Is_Buying_Most_Popular1
## 15 16
# Count slope coefficients: rows of the coefficient matrix minus the intercept
paste("number of coefficient:", nrow(summary(full.model.transformed)$coefficients) - 1)
## [1] "number of coefficient: 18"
# Create minimum model including an intercept only
min.model <- lm(Y_Income ~ 1, data = train)
# Identify variables not selected by F-B Stepwise regression
# index.step <- which(!(names(coef(full.model)) %in% names(coef(step.model))))
# cat("\n\n\n Variables not selected by forward-backward stepwise:",
#     names(coef(full.model)[index.step]))
# Perform forward-backward stepwise regression between the intercept-only
# model and the full model, selecting by AIC (step()'s default criterion)
step.model <- step(min.model, scope = list(lower = min.model, upper = full.model),
                   direction = "both", trace = FALSE)
summary(step.model)
## NOTE(review): the pasted knit output below was not commented out and
## had its significance stars stripped by markdown; reconstructed here so
## the file remains valid R.
## Call: lm(formula = Y_Income ~ Sales_Revenue + Orders_Unique +
##     Total_Items_Purchased + Types_Items_Purchased + Types_Items_Returned +
##     Average_Unit_Refund_Return + Country, data = train)
##
## Residuals:      Min       1Q   Median       3Q      Max
##            -21030.7   -477.5   -165.5    349.8  30513.5
##
## Coefficients:                Estimate Std. Error t value Pr(>|t|)
## (Intercept)                 163.83102  200.56172   0.817 0.414150
## Sales_Revenue                 1.80237    0.05951  30.287  < 2e-16 ***
## Orders_Unique              -147.28508   20.20997  -7.288 5.30e-13 ***
## Total_Items_Purchased        -0.40919    0.08244  -4.964 7.79e-07 ***
## Types_Items_Purchased        -5.03032    1.39013  -3.619 0.000307 ***
## Types_Items_Returned        -46.44894   17.41150  -2.668 0.007727 **
## Average_Unit_Refund_Return   -3.72667    1.96528  -1.896 0.058136 .
## CountryUnited Kingdom       351.03456  201.63276   1.741 0.081915 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2072 on 1371 degrees of freedom
## Multiple R-squared: 0.8949, Adjusted R-squared: 0.8944
## F-statistic: 1668 on 7 and 1371 DF, p-value: < 2.2e-16
# Significant coefficients (p-value < 0.05); use `$coefficients`
# rather than the partial-matched `$coeff`
which(summary(step.model)$coefficients[,4]<0.05)
##         Sales_Revenue         Orders_Unique Total_Items_Purchased
##                     2                     3                     4
## Types_Items_Purchased  Types_Items_Returned
##                     5                     6
print("--------------------------------------------------------------------------------")
## [1] "--------------------------------------------------------------------------------"
## Insignificant variables
# Insignificant coefficients (p-value > 0.05)
which(summary(step.model)$coefficients[,4]>0.05)
##                (Intercept) Average_Unit_Refund_Return
##                          1                          7
##      CountryUnited Kingdom
##                          8
s <- summary(step.model)
# The coefficient matrix has 4 columns, so length() returns 4 * nrow()
length(s$coefficients)
## [1] 32
# Count slope coefficients: rows of the coefficient matrix minus the intercept
paste("number of coefficient:", nrow(summary(step.model)$coefficients) - 1)
## [1] "number of coefficient: 7"
# Box-Cox transformation to choose a variance-stabilising power of the response
bc <- boxCox(step.model)
# Extract optimal lambda
opt.lambda <- bc$x[which.max(bc$y)]
# Report lambda rounded to the nearest 0.5. Note: cat() has no `end`
# argument (that is a Python idiom); the original `end="\n"` merely printed
# the string "\n" — pass the newline as an ordinary argument instead.
cat("Optimal Lambda = ", round(opt.lambda/0.5)*0.5, "\n")
## Optimal Lambda = 0.5
# Refit the stepwise-selected model with the Box-Cox-suggested square-root
# transform of the response (lambda = 0.5). Use `<-` for assignment and the
# documented `^` operator instead of the undocumented `**` synonym.
step.model.transformed <- lm(
  formula = Y_Income^(1/2) ~ Sales_Revenue + Orders_Unique +
    Total_Items_Purchased + Types_Items_Purchased + Types_Items_Returned +
    Average_Unit_Refund_Return + Country,
  data = train
)
summary(step.model.transformed)
##
## Call:
## lm(formula = Y_Income^(1/2) ~ Sales_Revenue + Orders_Unique +
## Total_Items_Purchased + Types_Items_Purchased + Types_Items_Returned +
## Average_Unit_Refund_Return + Country, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -92.953 -9.581 -2.212 7.527 141.597
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.7851972 1.5566698 15.280 < 2e-16 ***
## Sales_Revenue 0.0060357 0.0004619 13.067 < 2e-16 ***
## Orders_Unique 1.3435685 0.1568607 8.565 < 2e-16 ***
## Total_Items_Purchased -0.0029727 0.0006398 -4.646 3.71e-06 ***
## Types_Items_Purchased 0.0312137 0.0107896 2.893 0.00388 **
## Types_Items_Returned 0.3591615 0.1351402 2.658 0.00796 **
## Average_Unit_Refund_Return -0.0034860 0.0152536 -0.229 0.81927
## CountryUnited Kingdom -2.3864777 1.5649827 -1.525 0.12751
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.08 on 1371 degrees of freedom
## Multiple R-squared: 0.6587, Adjusted R-squared: 0.6569
## F-statistic: 377.9 on 7 and 1371 DF, p-value: < 2.2e-16
# Significant coefficients (p-value < 0.05); use `$coefficients`
# rather than the partial-matched `$coeff`
which(summary(step.model.transformed)$coefficients[,4]<0.05)
## (Intercept) Sales_Revenue Orders_Unique
## 1 2 3
## Total_Items_Purchased Types_Items_Purchased Types_Items_Returned
## 4 5 6
print("--------------------------------------------------------------------------------")
## [1] "--------------------------------------------------------------------------------"
## Insignificant variables
# Insignificant coefficients (p-value > 0.05)
which(summary(step.model.transformed)$coefficients[,4]>0.05)
## Average_Unit_Refund_Return CountryUnited Kingdom
## 7 8
# Count slope coefficients: rows of the coefficient matrix minus the intercept
paste("number of coefficient:", nrow(summary(step.model.transformed)$coefficients) - 1)
## [1] "number of coefficient: 7"
# Set a seed for reproducibility
set.seed(1)
# Set predictors and response to correct format:
# scaled copies for the penalised glmnet fits, unscaled "_ws" copies for the
# later OLS refits; model.matrix() expands factors to dummies and [,-1]
# drops the intercept column
x.train <- scale(model.matrix(Y_Income ~ ., train)[,-1])
y.train <- scale(train$Y_Income)
x.train_ws <- model.matrix(Y_Income ~ ., train)[,-1]
y.train_ws <- train$Y_Income
# Use 10-fold cross validation to find the optimal penalty lambda
cv.lasso <- cv.glmnet(x.train, y.train, alpha = 1, nfolds = 10)
cv.lasso$lambda.min
## [1] 0.006786743
# Train Lasso (alpha = 1) and display coefficients at the CV-optimal lambda
lasso.model <- glmnet(x.train, y.train, alpha = 1, nlambda = 100)
coef(lasso.model, cv.lasso$lambda.min)
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -1.515894e-17
## Orders_Unique -6.565112e-02
## Returns_Unique -1.163078e-02
## Total_Items_Purchased .
## Quantity_Basket -3.779479e-03
## Total_Items_Returned .
## Types_Items_Purchased -2.668665e-02
## Unique_Item_Per_Basket .
## Types_Items_Returned .
## Unique_Item_Per_Return -1.257738e-02
## Sales_Revenue 9.931592e-01
## Return_Refund .
## Average_Unit_Price_Purchase .
## Average_Unit_Refund_Return -9.763884e-03
## CountryUnited Kingdom 3.745255e-03
## Is_Buying_Most_Popular1 .
## Recency 5.293025e-03
## RFMSeg1 5.527952e-04
## RFMSeg2 .
# Identify variables not selected by Lasso (coefficients shrunk exactly to 0)
index.lasso <- which(coef(lasso.model, cv.lasso$lambda.min) == 0)
cat("\n\n\n Variables not selected by lasso regression: ",
names(coef(full.model)[index.lasso]))
##
##
##
## Variables not selected by lasso regression: Total_Items_Purchased Total_Items_Returned Unique_Item_Per_Basket Types_Items_Returned Return_Refund Average_Unit_Price_Purchase Is_Buying_Most_Popular1 RFMSeg2
# Retrain OLS model using Lasso-selected predictors.
# index.lasso indexes the coef vector, which includes the intercept at
# position 1, so subtract 1 to index the columns of x.train_ws (which has no
# intercept column). NOTE(review): this assumes the intercept coefficient is
# never exactly 0 -- if it were, -(0) would silently drop nothing; confirm.
lasso.predictors <- as.data.frame(x.train_ws)[-(index.lasso-1)]
lasso.retrained <- lm(y.train_ws ~ ., data = lasso.predictors)
summary(lasso.retrained)
##
## Call:
## lm(formula = y.train_ws ~ ., data = lasso.predictors)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19236 -518 -196 344 32005
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 228.05282 273.66698 0.833 0.4048
## Orders_Unique -139.83618 25.03943 -5.585 2.82e-08 ***
## Returns_Unique -44.07081 55.48524 -0.794 0.4272
## Quantity_Basket -0.45998 0.23093 -1.992 0.0466 *
## Types_Items_Purchased -3.93727 1.43079 -2.752 0.0060 **
## Unique_Item_Per_Return -47.22059 26.67094 -1.770 0.0769 .
## Sales_Revenue 1.53739 0.01993 77.148 < 2e-16 ***
## Average_Unit_Refund_Return -3.40148 2.00504 -1.696 0.0900 .
## `CountryUnited Kingdom` 240.09303 203.61760 1.179 0.2385
## Recency 0.97199 1.24052 0.784 0.4334
## RFMSeg1 63.72478 161.33859 0.395 0.6929
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2088 on 1368 degrees of freedom
## Multiple R-squared: 0.8936, Adjusted R-squared: 0.8928
## F-statistic: 1148 on 10 and 1368 DF, p-value: < 2.2e-16
# Significant coefficients (p-value < 0.05); use `$coefficients`
# rather than the partial-matched `$coeff`
which(summary(lasso.retrained)$coefficients[,4]<0.05)
## Orders_Unique Quantity_Basket Types_Items_Purchased
## 2 4 5
## Sales_Revenue
## 7
print("--------------------------------------------------------------------------------")
## [1] "--------------------------------------------------------------------------------"
## Insignificant variables
# Insignificant coefficients (p-value > 0.05)
which(summary(lasso.retrained)$coefficients[,4]>0.05)
## (Intercept) Returns_Unique
## 1 3
## Unique_Item_Per_Return Average_Unit_Refund_Return
## 6 8
## `CountryUnited Kingdom` Recency
## 9 10
## RFMSeg1
## 11
# Count slope coefficients: rows of the coefficient matrix minus the intercept
paste("number of coefficient:", nrow(summary(lasso.retrained)$coefficients) - 1)
## [1] "number of coefficient: 10"
# Plot the regression coefficient path.
set.seed(1)
# Refit the lasso path (same call as lasso.model above) for plotting
lassomodel <- glmnet(x.train, y.train, alpha = 1, nlambda = 100)
## Plot coefficient paths
plot(lassomodel, xvar = "lambda", label = TRUE, lwd = 2)
# Mark the CV-optimal lambda on the path plot
abline(v = log(cv.lasso$lambda.min), col = "black", lty = 2, lwd = 2)
# Box-Cox transformation on the lasso-retrained OLS model
bc <- boxCox(lasso.retrained)
# Extract optimal lambda
opt.lambda <- bc$x[which.max(bc$y)]
# Report lambda rounded to the nearest 0.5. cat() has no `end` argument
# (a Python idiom); pass the newline as an ordinary argument.
cat("Optimal Lambda = ", round(opt.lambda/0.5)*0.5, "\n")
## Optimal Lambda = 0.5
# Refit OLS on the lasso-selected predictors with a sqrt-transformed
# response (documented `^` operator instead of the `**` synonym)
lasso.retrained.transformed <- lm(y.train_ws^(1/2) ~ ., data = lasso.predictors)
summary(lasso.retrained.transformed)
##
## Call:
## lm(formula = y.train_ws^(1/2) ~ ., data = lasso.predictors)
##
## Residuals:
## Min 1Q Median 3Q Max
## -74.74 -8.82 -1.79 7.09 135.74
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.474078 2.059502 11.398 < 2e-16 ***
## Orders_Unique 1.561800 0.188436 8.288 2.71e-16 ***
## Returns_Unique 1.100590 0.417558 2.636 0.00849 **
## Quantity_Basket 0.015342 0.001738 8.828 < 2e-16 ***
## Types_Items_Purchased 0.018765 0.010767 1.743 0.08161 .
## Unique_Item_Per_Return -0.157339 0.200714 -0.784 0.43324
## Sales_Revenue 0.003314 0.000150 22.099 < 2e-16 ***
## Average_Unit_Refund_Return -0.015773 0.015089 -1.045 0.29605
## `CountryUnited Kingdom` -1.525716 1.532340 -0.996 0.31958
## Recency -0.021147 0.009336 -2.265 0.02366 *
## RFMSeg1 -2.209740 1.214166 -1.820 0.06898 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.71 on 1368 degrees of freedom
## Multiple R-squared: 0.6749, Adjusted R-squared: 0.6725
## F-statistic: 284 on 10 and 1368 DF, p-value: < 2.2e-16
# Significant coefficients (p-value < 0.05); use `$coefficients`
# rather than the partial-matched `$coeff`
which(summary(lasso.retrained.transformed)$coefficients[,4]<0.05)
## (Intercept) Orders_Unique Returns_Unique Quantity_Basket Sales_Revenue
## 1 2 3 4 7
## Recency
## 10
print("--------------------------------------------------------------------------------")
## [1] "--------------------------------------------------------------------------------"
## Insignificant variables
# Insignificant coefficients (p-value > 0.05)
which(summary(lasso.retrained.transformed)$coefficients[,4]>0.05)
## Types_Items_Purchased Unique_Item_Per_Return
## 5 6
## Average_Unit_Refund_Return `CountryUnited Kingdom`
## 8 9
## RFMSeg1
## 11
# Count slope coefficients: rows of the coefficient matrix minus the intercept
paste("number of coefficient:", nrow(summary(lasso.retrained.transformed)$coefficients) - 1)
## [1] "number of coefficient: 10"
# Set a seed for reproducibility
set.seed(1)
# Use cross validation to find optimal lambda
# (alpha = 0.5 mixes the L1 and L2 penalties equally; nfolds defaults to 10)
cv.elnet <- cv.glmnet(x.train, y.train, alpha = 0.5)
# Train Elastic Net and display coefficients with optimal lambda
elnet.model <- glmnet(x.train, y.train, alpha = 0.5)
coef(elnet.model, cv.elnet$lambda.min)
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -1.472182e-17
## Orders_Unique -6.769646e-02
## Returns_Unique -1.254931e-02
## Total_Items_Purchased .
## Quantity_Basket -6.250172e-03
## Total_Items_Returned .
## Types_Items_Purchased -2.705066e-02
## Unique_Item_Per_Basket .
## Types_Items_Returned .
## Unique_Item_Per_Return -1.605573e-02
## Sales_Revenue 9.934581e-01
## Return_Refund -6.694274e-03
## Average_Unit_Price_Purchase .
## Average_Unit_Refund_Return -1.519984e-02
## CountryUnited Kingdom 5.812796e-03
## Is_Buying_Most_Popular1 .
## Recency 6.381197e-03
## RFMSeg1 1.584849e-03
## RFMSeg2 .
# CV-optimal penalty for the elastic net
cv.elnet$lambda.min
## [1] 0.008524545
# Identify variables not selected by Elastic Net (coefficients exactly zero)
index.elnet <- which(coef(elnet.model, cv.elnet$lambda.min) == 0)
cat("\n\n\n Variables not selected by elastic net regression:",
names(coef(full.model)[index.elnet]))
##
##
##
## Variables not selected by elastic net regression: Total_Items_Purchased Total_Items_Returned Unique_Item_Per_Basket Types_Items_Returned Average_Unit_Price_Purchase Is_Buying_Most_Popular1 RFMSeg2
# Retrain OLS on the elastic-net-selected predictors.
# index.elnet counts the intercept (position 1 of the coef vector), hence the
# -1 shift to index x.train_ws columns. NOTE(review): assumes the intercept
# itself is never exactly zero -- confirm.
elnet.predictors <- as.data.frame(x.train_ws)[-(index.elnet-1)]
elnet.retrained <- lm(y.train_ws ~ ., data = elnet.predictors)
summary(elnet.retrained)
##
## Call:
## lm(formula = y.train_ws ~ ., data = elnet.predictors)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18973 -523 -198 350 32061
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 266.51592 274.41368 0.971 0.33161
## Orders_Unique -146.00608 25.28428 -5.775 9.54e-09 ***
## Returns_Unique -55.83016 55.87726 -0.999 0.31790
## Quantity_Basket -0.47465 0.23093 -2.055 0.04003 *
## Types_Items_Purchased -3.76505 1.43339 -2.627 0.00872 **
## Unique_Item_Per_Return -66.11525 28.87933 -2.289 0.02221 *
## Sales_Revenue 1.53108 0.02026 75.580 < 2e-16 ***
## Return_Refund -0.76975 0.45302 -1.699 0.08952 .
## Average_Unit_Refund_Return -6.00220 2.52138 -2.381 0.01742 *
## `CountryUnited Kingdom` 251.32954 203.58474 1.235 0.21722
## Recency 0.78335 1.24463 0.629 0.52920
## RFMSeg1 49.22705 161.45304 0.305 0.76049
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2086 on 1367 degrees of freedom
## Multiple R-squared: 0.8938, Adjusted R-squared: 0.8929
## F-statistic: 1046 on 11 and 1367 DF, p-value: < 2.2e-16
# Significant coefficients (p-value < 0.05); use `$coefficients`
# rather than the partial-matched `$coeff`
which(summary(elnet.retrained)$coefficients[,4]<0.05)
## Orders_Unique Quantity_Basket
## 2 4
## Types_Items_Purchased Unique_Item_Per_Return
## 5 6
## Sales_Revenue Average_Unit_Refund_Return
## 7 9
print("--------------------------------------------------------------------------------")
## [1] "--------------------------------------------------------------------------------"
## Insignificant variables
# Insignificant coefficients (p-value > 0.05)
which(summary(elnet.retrained)$coefficients[,4]>0.05)
## (Intercept) Returns_Unique Return_Refund
## 1 3 8
## `CountryUnited Kingdom` Recency RFMSeg1
## 10 11 12
# Count slope coefficients: rows of the coefficient matrix minus the intercept
paste("number of coefficient:", nrow(summary(elnet.retrained)$coefficients) - 1)
## [1] "number of coefficient: 11"
# Recap: variables dropped by each of the three selection methods, shown
# side by side for comparison
# Identify variables not selected by F-B Stepwise regression
index.step <- which(!(names(coef(full.model)) %in% names(coef(step.model))))
cat("\n\n\n Variables not selected by forward-backward stepwise:",
names(coef(full.model)[index.step]))
##
##
##
## Variables not selected by forward-backward stepwise: Returns_Unique Quantity_Basket Total_Items_Returned Unique_Item_Per_Basket Unique_Item_Per_Return Return_Refund Average_Unit_Price_Purchase Is_Buying_Most_Popular1 Recency RFMSeg1 RFMSeg2
# Identify variables not selected by Lasso (coefficients exactly zero)
index.lasso <- which(coef(lasso.model, cv.lasso$lambda.min) == 0)
cat("\n\n\n Variables not selected by lasso regression: ",
names(coef(full.model)[index.lasso]))
##
##
##
## Variables not selected by lasso regression: Total_Items_Purchased Total_Items_Returned Unique_Item_Per_Basket Types_Items_Returned Return_Refund Average_Unit_Price_Purchase Is_Buying_Most_Popular1 RFMSeg2
# Identify variables not selected by Elastic Net (coefficients exactly zero)
index.elnet <- which(coef(elnet.model, cv.elnet$lambda.min) == 0)
cat("\n\n\n Variables not selected by elastic net regression:",
names(coef(full.model)[index.elnet]))
##
##
##
## Variables not selected by elastic net regression: Total_Items_Purchased Total_Items_Returned Unique_Item_Per_Basket Types_Items_Returned Average_Unit_Price_Purchase Is_Buying_Most_Popular1 RFMSeg2
## Now we use the models fitted above to generate predictions for the test set. Since this is a regression task (predicting Y_Income), the models are compared on their out-of-sample prediction error -- MSPE, MAE, MAPE, and PM -- rather than on a classification threshold.
# 1. Prediction for the full model and transformed
# Obtain predictions for the test set (these are lm fits, not probabilities)
pred.full = predict(full.model, newdata = test, type = "response")
# The transformed models predict sqrt(Y_Income), so square to undo the transform
pred.full.transformed = predict(full.model.transformed, newdata = test, type = "response")**2
# 2. Prediction for the stepwise regression
# Obtain predictions for the test set
pred.step = predict(step.model, newdata = test, type = "response")
pred.step.transformed=predict(step.model.transformed, newdata = test, type = "response")**2
# 3. Prediction for the lasso regression
# Set test data to correct format (dummy-encode factors; drop the intercept
# column so names match the lasso.predictors training columns)
new_test <- model.matrix( ~ ., test)[,-1]
# Obtain predictions for the test set
pred.lasso = predict(lasso.retrained, newdata = as.data.frame(new_test),
type = "response")
# Square to undo the sqrt transform of the response
pred.lasso.transformed = predict(lasso.retrained.transformed, newdata = as.data.frame(new_test),
type = "response")**2
# 4. Prediction for elastic net regression
# Set predictors to correct format
x.test <- model.matrix(Y_Income ~ ., test)[,-1]
# Obtain predictions for the test set
# pred.elnet = as.vector(predict(elnet.model, newx = x.test,
# type = "response", s = cv.elnet$lambda.min))
pred.elnet=predict(elnet.retrained, newdata = as.data.frame(x.test),
type = "response")
# Create a data frame with the predictions: column 1 is the actual
# Y_Income, columns 2-8 are the seven model predictions
preds = data.frame(Y_Income = test$Y_Income, pred.full,pred.full.transformed,
pred.step,pred.step.transformed, pred.lasso,pred.lasso.transformed, pred.elnet)
# Mean squared prediction error between predictions and observed values.
mspe <- function(prediction, testData) {
  squared_errors <- (testData - prediction)^2
  mean(squared_errors)
}
mae <-function(prediction, testData) {return(mean(abs(testData - prediction)))}
mape <-function(prediction, testData) {return(mean(abs(testData - prediction)/testData))}
pm <-function(prediction, testData) {return(sum((testData - prediction)^2)/sum((testData - mean(testData))^2))}
# Build the metrics table: MSPE, MAE, MAPE and PM for each of the seven
# prediction columns (columns 2-8 of `preds`; column 1 is observed Y_Income).
# Preallocated via vapply instead of growing the data frame row-by-row,
# which is O(n^2) and copies the frame on every append.
testData <- preds[, 1]
report_result <- data.frame(t(vapply(
  2:8,
  function(i) {
    prediction <- preds[, i]
    c(MSPE = mspe(prediction, testData),
      MAE  = mae(prediction, testData),
      MAPE = mape(prediction, testData),
      PM   = pm(prediction, testData))
  },
  numeric(4)
)))
# Row labels correspond to the column order of `preds`.
rownames(report_result) <- c("Full", "Full-Transformed", "Step-Wise",
                             "Step-Wise-Transformed", "Lasso",
                             "Lasso-Transformed", "ENet")
Adding R-squared and adjusted R-squared to the report.
# Append R-squared / adjusted R-squared from each model's summary.
# NOTE(review): summary(<x>)$r.squared is an lm-summary field, so the
# lasso/elnet "retrained" objects are presumably lm refits on the
# selected variables -- confirm with the model-fitting section.
report_result$R.Squared=c(summary(full.model)$r.squared,summary(full.model.transformed)$r.squared,summary(step.model)$r.squared,summary(step.model.transformed)$r.squared,summary(lasso.retrained)$r.squared,summary(lasso.retrained.transformed)$r.squared,summary(elnet.retrained)$r.squared)
report_result$Adj.R.Squared=c(summary(full.model)$adj.r.squared,summary(full.model.transformed)$adj.r.squared,summary(step.model)$adj.r.squared,summary(step.model.transformed)$adj.r.squared,summary(lasso.retrained)$adj.r.squared,summary(lasso.retrained.transformed)$adj.r.squared,summary(elnet.retrained)$adj.r.squared)
# Manually transcribed coefficient counts, stored as character strings.
report_result$number_of_coefficients =c("19","19","8","8","11","11","12")
report_result$number_of_significant_coefficients =c("4","11","3","6","4","6","6")
# Placeholder assumption-check columns; the "N" entries are overwritten
# below as each model's residual diagnostics are reviewed.
report_result$OveralL_GOF=c("N","N","N","N","N","N","N")
report_result$Linearity=c("N","N","N","N","N","N","N")
report_result$ConstantVariance=c("N","N","N","N","N","N","N")
report_result$Independence=c("N","N","N","N","N","N","N")
report_result$Normality=c("N","N","N","N","N","N","N")
## Residual diagnostics for the full (untransformed) model.
# Get standardized residuals
resids = rstandard(full.model)
par(mfrow = c(2, 2))
# Plot residuals against each predictor column of train3 to check linearity
# (columns 14, 15, 17, 18 are skipped; presumably non-numeric/derived -- confirm).
for (i in c(1:18)) {
  col_name = names(train3[i])
  if (!(i %in% c(14, 15, 17, 18))) {
    plot(train3[, i], resids, xlab = col_name, ylab = "S. Residuals")
    abline(h = 0, col = "red")
    lines(lowess(train3[, i], resids), col = 'blue')
  }
}
# Checking for constant variance and uncorrelated errors
# Plot of std. residuals versus fitted values.
# FIX: these residuals come from full.model, so plot them against
# full.model's fitted values (the original mistakenly used step.model's).
plot(full.model$fitted.values, resids, xlab="Fitted Values", ylab=" S. Residuals")
lines(lowess(full.model$fitted.values, resids), col='blue')
abline(h=0, col="red")
# Capture the full-model summary; it is printed further below.
summary_full_model = summary(full.model)
# Plots for normality
hist(resids, col="orange", nclass=15)
# qqPlot (car) also flags the two most extreme observations by row index
qqPlot(resids)
## [1] 667 122
# Record the assumption-check verdicts for the full model (row 1).
report_result$OveralL_GOF[1] = "not good"
report_result$Linearity[1] = "Seems to be holding"
report_result$ConstantVariance[1] = "Does not seem to be holding"
report_result$Independence[1] = "Errors are uncorrelated"
report_result$Normality[1] = "Does not seem to be holding"
summary_full_model
##
## Call:
## lm(formula = Y_Income ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20994.5 -519.1 -170.5 366.8 30363.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -325.55358 404.54171 -0.805 0.42111
## Orders_Unique -135.49509 28.03361 -4.833 1.50e-06 ***
## Returns_Unique -88.98896 77.15597 -1.153 0.24896
## Total_Items_Purchased -0.38652 0.09518 -4.061 5.17e-05 ***
## Quantity_Basket -0.20879 0.25732 -0.811 0.41727
## Total_Items_Returned -0.38686 1.15234 -0.336 0.73713
## Types_Items_Purchased -6.05021 2.28675 -2.646 0.00824 **
## Unique_Item_Per_Basket 3.93614 5.07074 0.776 0.43774
## Types_Items_Returned 2.78044 42.28039 0.066 0.94758
## Unique_Item_Per_Return -72.86254 52.38147 -1.391 0.16445
## Sales_Revenue 1.78890 0.06531 27.390 < 2e-16 ***
## Return_Refund -0.40498 0.59072 -0.686 0.49311
## Average_Unit_Price_Purchase -3.71719 6.17393 -0.602 0.54722
## Average_Unit_Refund_Return -4.39604 2.93526 -1.498 0.13445
## CountryUnited Kingdom 342.48439 204.18272 1.677 0.09371 .
## Is_Buying_Most_Popular1 -0.83823 126.44192 -0.007 0.99471
## Recency 0.53403 1.28981 0.414 0.67891
## RFMSeg1 497.31935 352.12084 1.412 0.15807
## RFMSeg2 519.49917 395.29515 1.314 0.18900
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2075 on 1360 degrees of freedom
## Multiple R-squared: 0.8955, Adjusted R-squared: 0.8941
## F-statistic: 647.2 on 18 and 1360 DF, p-value: < 2.2e-16
## Residual diagnostics for the transformed full model.
# Get standardized residuals
resids = rstandard(full.model.transformed)
par(mfrow = c(2, 2))
# Plot residuals against each predictor column of train3 to check linearity
# (columns 14, 15, 17, 18 are skipped; presumably non-numeric/derived -- confirm).
for (i in c(1:18)) {
  col_name = names(train3[i])
  if (!(i %in% c(14, 15, 17, 18))) {
    plot(train3[, i], resids, xlab = col_name, ylab = "S. Residuals")
    abline(h = 0, col = "red")
    lines(lowess(train3[, i], resids), col = 'blue')
  }
}
# Checking for constant variance and uncorrelated errors
# Plot of std. residuals versus fitted values.
# FIX: these residuals come from full.model.transformed, so plot against
# that model's fitted values (the original mistakenly used step.model's).
plot(full.model.transformed$fitted.values, resids, xlab="Fitted Values", ylab=" S. Residuals")
lines(lowess(full.model.transformed$fitted.values, resids), col='blue')
abline(h=0, col="red")
# Plots for normality
hist(resids, col="orange", nclass=15)
qqPlot(resids)
## [1] 122 498
# Record the assumption-check verdicts for the transformed full model
# (row 2). FIX: corrected the typo "moderaetly" and the stray leading
# space / grammar in the verdict strings (these end up in the output CSV).
report_result$OveralL_GOF[2] = "Average"
report_result$Linearity[2] = "Seems to be holding"
report_result$ConstantVariance[2] = "Seems to be holding"
report_result$Independence[2] = "Errors are uncorrelated"
report_result$Normality[2] = "Improved and moderately holding"
## Residual diagnostics for the step-wise model.
# Get standardized residuals
resids = rstandard(step.model)
par(mfrow=c(2,2))
# Plot residuals against each predictor retained by the step-wise
# selection (matched by column name) to check linearity.
for (i in c(1:18)){
col_name = names(train[i])
if ((col_name %in% c("Sales_Revenue" , "Return_Refund" ,
"Total_Items_Purchased" , "Types_Items_Purchased" , "Unique_Item_Per_Basket" ,
"Is_Buying_Most_Popular", "Country", "Quantity_Basket", "Recency",
"RFMSeg"))){
plot(train[,i], resids, xlab= col_name, ylab = "S. Residuals")
abline(h=0, col="red")
lines(lowess(train[,i], resids), col='blue')
}
}
# Checking for constant variance and uncorrelated errors
# Plot of std. residuals versus fitted values
plot(step.model$fitted.values, resids, xlab="Fitted Values", ylab=" S. Residuals")
lines(lowess(step.model$fitted.values, resids), col='blue')
abline(h=0, col="red")
# Plots for normality
hist(resids, col="orange", nclass=15)
qqPlot(resids)
## [1] 122 667
# Record the assumption-check verdicts for the step-wise model (row 3).
# FIX: corrected the typo "seemd" in the Normality verdict string
# (it ends up in the output CSV).
report_result$OveralL_GOF[3] = "Average"
report_result$Linearity[3] = "Seems to be holding"
report_result$ConstantVariance[3] = "not clearly holding"
report_result$Independence[3] = "Errors are uncorrelated"
report_result$Normality[3] = "Does not seem to be holding"
## Residual diagnostics for the transformed step-wise model.
# Get standardized residuals
resids = rstandard(step.model.transformed)
par(mfrow=c(2,2))
# Plot residuals against each predictor retained by the step-wise
# selection (matched by column name) to check linearity.
for (i in c(1:18)){
col_name = names(train[i])
if ((col_name %in% c("Sales_Revenue" , "Return_Refund" ,
"Total_Items_Purchased" , "Types_Items_Purchased" , "Unique_Item_Per_Basket" ,
"Is_Buying_Most_Popular", "Country", "Quantity_Basket", "Recency",
"RFMSeg"))){
plot(train[,i], resids, xlab= col_name, ylab = "S. Residuals")
abline(h=0, col="red")
lines(lowess(train[,i], resids), col='blue')
}
}
# Checking for constant variance and uncorrelated errors
# Plot of std. residuals versus fitted values
plot(step.model.transformed$fitted.values, resids, xlab="Fitted Values", ylab=" S. Residuals")
lines(lowess(step.model.transformed$fitted.values, resids), col='blue')
abline(h=0, col="red")
# Plots for normality
hist(resids, col="orange", nclass=15)
qqPlot(resids)
## [1] 511 667
# Assessment for the transformed step-wise model (row 4):
# OveralL_GOF = Average
# Linearity = Seems to be holding
# ConstantVariance = Seems to be holding
# Independence = Errors are uncorrelated
# Normality = Improved and moderately holding
# FIX: corrected the typo "moderaetly" and grammar in the verdict
# strings (they end up in the output CSV).
report_result$OveralL_GOF[4] = "Average"
report_result$Linearity[4] = "Seems to be holding"
report_result$ConstantVariance[4] = "Seems to be holding"
report_result$Independence[4] = "Errors are uncorrelated"
report_result$Normality[4] = "Improved and moderately holding"
## Residual diagnostics for the (retrained) lasso model.
# Get standardized residuals
resids = rstandard(lasso.retrained)
par(mfrow = c(2, 2))
# Plot residuals against each lasso-selected predictor to check linearity.
# NOTE(review): the c(14, 15, 17, 18) skip list is inherited from the
# 18-column train3 loop and may not map onto lasso.predictors -- confirm.
for (i in seq_len(ncol(lasso.predictors))) {
  col_name = names(lasso.predictors[i])
  if (!(i %in% c(14, 15, 17, 18))) {
    plot(lasso.predictors[, i], resids, xlab = col_name, ylab = "S. Residuals")
    abline(h = 0, col = "red")
    lines(lowess(lasso.predictors[, i], resids), col = 'blue')
  }
}
# Checking for constant variance and uncorrelated errors
# Plot of std. residuals versus fitted values.
# FIX: these residuals come from lasso.retrained, so plot against its own
# fitted values (the original mistakenly used step.model's).
plot(lasso.retrained$fitted.values, resids, xlab="Fitted Values", ylab=" S. Residuals")
lines(lowess(lasso.retrained$fitted.values, resids), col='blue')
abline(h=0, col="red")
# Plots for normality
hist(resids, col="orange", nclass=15)
qqPlot(resids)
## [1] 667 122
# Assessment for the lasso model (row 5):
# OveralL_GOF = Average
# Linearity = Seems to be holding
# ConstantVariance = Does not seem to be clearly holding
# Independence = Errors are uncorrelated
# Normality = Does not seem to be holding
report_result$OveralL_GOF[5] = "Average"
report_result$Linearity[5] = "Seems to be holding"
report_result$ConstantVariance[5] = "Does not seem to be clearly holding"
report_result$Independence[5] = "Errors are uncorrelated"
report_result$Normality[5] = "Does not seem to be holding"
## Residual diagnostics for the transformed (retrained) lasso model.
# Get standardized residuals
resids = rstandard(lasso.retrained.transformed)
par(mfrow = c(2, 2))
# Plot residuals against each lasso-selected predictor to check linearity.
# NOTE(review): the c(14, 15, 17, 18) skip list is inherited from the
# 18-column train3 loop and may not map onto lasso.predictors -- confirm.
for (i in seq_len(ncol(lasso.predictors))) {
  col_name = names(lasso.predictors[i])
  if (!(i %in% c(14, 15, 17, 18))) {
    plot(lasso.predictors[, i], resids, xlab = col_name, ylab = "S. Residuals")
    abline(h = 0, col = "red")
    lines(lowess(lasso.predictors[, i], resids), col = 'blue')
  }
}
# Checking for constant variance and uncorrelated errors
# Plot of std. residuals versus fitted values.
# FIX: these residuals come from lasso.retrained.transformed, so plot
# against its own fitted values (the original mistakenly used step.model's).
plot(lasso.retrained.transformed$fitted.values, resids, xlab="Fitted Values", ylab=" S. Residuals")
lines(lowess(lasso.retrained.transformed$fitted.values, resids), col='blue')
abline(h=0, col="red")
# Plots for normality
hist(resids, col="orange", nclass=15)
qqPlot(resids)
## [1] 122 667
# Assessment for the transformed lasso model (row 6):
# OveralL_GOF = Average
# Linearity = Seems to be holding
# ConstantVariance = Seems to be holding
# Independence = Errors are uncorrelated
# Normality = Improved and moderately holding
# FIX: corrected the typo "moderaetly" and grammar in the verdict
# strings (they end up in the output CSV).
report_result$OveralL_GOF[6] = "Average"
report_result$Linearity[6] = "Seems to be holding"
report_result$ConstantVariance[6] = "Seems to be holding"
report_result$Independence[6] = "Errors are uncorrelated"
report_result$Normality[6] = "Improved and moderately holding"
## Residual diagnostics for the (retrained) elastic net model.
# Get standardized residuals
resids = rstandard(elnet.retrained)
par(mfrow = c(2, 2))
# Plot residuals against each elastic-net-selected predictor to check
# linearity. NOTE(review): the c(14, 15, 17, 18) skip list is inherited
# from the 18-column train3 loop and may not map onto elnet.predictors -- confirm.
for (i in seq_len(ncol(elnet.predictors))) {
  col_name = names(elnet.predictors[i])
  if (!(i %in% c(14, 15, 17, 18))) {
    plot(elnet.predictors[, i], resids, xlab = col_name, ylab = "S. Residuals")
    abline(h = 0, col = "red")
    lines(lowess(elnet.predictors[, i], resids), col = 'blue')
  }
}
# Checking for constant variance and uncorrelated errors
# Plot of std. residuals versus fitted values.
# FIX: these residuals come from elnet.retrained, so plot against its own
# fitted values (the original mistakenly used step.model's).
plot(elnet.retrained$fitted.values, resids, xlab="Fitted Values", ylab=" S. Residuals")
lines(lowess(elnet.retrained$fitted.values, resids), col='blue')
abline(h=0, col="red")
# Plots for normality
hist(resids, col="orange", nclass=15)
qqPlot(resids)
## [1] 667 122
# Assessment for the elastic net model (row 7):
# OveralL_GOF = Average
# Linearity = Seems to be holding
# ConstantVariance = Does not seem to be clearly holding
# Independence = Errors are uncorrelated
# Normality = Does not seem to be holding
report_result$OveralL_GOF[7] = "Average"
report_result$Linearity[7] = "Seems to be holding"
report_result$ConstantVariance[7] = "Does not seem to be clearly holding"
report_result$Independence[7] = "Errors are uncorrelated"
report_result$Normality[7] = "Does not seem to be holding"
report_result
## MSPE MAE MAPE PM R.Squared
## Full 13025690 1187.4253 48.05262 0.24220980 0.8954624
## Full-Transformed 4621712 939.8901 38.18104 0.08593969 0.6996922
## Step-Wise 12953894 1170.0673 48.53398 0.24087476 0.8949329
## Step-Wise-Transformed 6125621 990.7629 24.20578 0.11390455 0.6586523
## Lasso 13762029 1165.0353 45.12102 0.25590185 0.8935603
## Lasso-Transformed 12525764 1022.4075 31.18209 0.23291378 0.6749004
## ENet 14255322 1191.4448 45.07683 0.26507452 0.8937846
## Adj.R.Squared number_of_coefficients
## Full 0.8940788 19
## Full-Transformed 0.6957176 19
## Step-Wise 0.8943965 8
## Step-Wise-Transformed 0.6569095 8
## Lasso 0.8927823 11
## Lasso-Transformed 0.6725240 11
## ENet 0.8929300 12
## number_of_significant_coefficients OveralL_GOF
## Full 4 not good
## Full-Transformed 11 Average
## Step-Wise 3 Average
## Step-Wise-Transformed 6 Average
## Lasso 4 Average
## Lasso-Transformed 6 Average
## ENet 6 Average
## Linearity ConstantVariance
## Full Seems to be holding Does not seem to be holding
## Full-Transformed Seems to be holding seem to be holding
## Step-Wise Seems to be holding not clearly holding
## Step-Wise-Transformed Seems to be holding seem to be holding
## Lasso Seems to be holding Does not seem to be clearly holding
## Lasso-Transformed Seems to be holding seem to be holding
## ENet Seems to be holding Does not seem to be clearly holding
## Independence Normality
## Full Errors are uncorrelated Does not seem to be holding
## Full-Transformed Errors are uncorrelated Improved and moderaetly holding
## Step-Wise Errors are uncorrelated Does not seemd to be holding
## Step-Wise-Transformed Errors are uncorrelated Improved and moderaetly holding
## Lasso Errors are uncorrelated Does not seem to be holding
## Lasso-Transformed Errors are uncorrelated Improved and moderaetly holding
## ENet Errors are uncorrelated Does not seem to be holding
# Split the report into three thematic tables, selecting columns by NAME
# rather than by position (the original used 1:6 / 7:8, which silently
# breaks if columns are ever added or reordered upstream).
# (The "OveralL_GOF" spelling is kept as-is because earlier chunks create
# the column under that name.)
test_predictions_results_comparison <- report_result[c("MSPE", "MAE", "MAPE",
                                                       "PM", "R.Squared",
                                                       "Adj.R.Squared")]
variable_selection <- report_result[c("number_of_coefficients",
                                      "number_of_significant_coefficients")]
goodness_of_fit <- report_result[c("OveralL_GOF", "Linearity",
                                   "ConstantVariance", "Independence",
                                   "Normality")]
test_predictions_results_comparison
## MSPE MAE MAPE PM R.Squared
## Full 13025690 1187.4253 48.05262 0.24220980 0.8954624
## Full-Transformed 4621712 939.8901 38.18104 0.08593969 0.6996922
## Step-Wise 12953894 1170.0673 48.53398 0.24087476 0.8949329
## Step-Wise-Transformed 6125621 990.7629 24.20578 0.11390455 0.6586523
## Lasso 13762029 1165.0353 45.12102 0.25590185 0.8935603
## Lasso-Transformed 12525764 1022.4075 31.18209 0.23291378 0.6749004
## ENet 14255322 1191.4448 45.07683 0.26507452 0.8937846
## Adj.R.Squared
## Full 0.8940788
## Full-Transformed 0.6957176
## Step-Wise 0.8943965
## Step-Wise-Transformed 0.6569095
## Lasso 0.8927823
## Lasso-Transformed 0.6725240
## ENet 0.8929300
variable_selection
## number_of_coefficients number_of_significant_coefficients
## Full 19 4
## Full-Transformed 19 11
## Step-Wise 8 3
## Step-Wise-Transformed 8 6
## Lasso 11 4
## Lasso-Transformed 11 6
## ENet 12 6
goodness_of_fit
## OveralL_GOF Linearity
## Full not good Seems to be holding
## Full-Transformed Average Seems to be holding
## Step-Wise Average Seems to be holding
## Step-Wise-Transformed Average Seems to be holding
## Lasso Average Seems to be holding
## Lasso-Transformed Average Seems to be holding
## ENet Average Seems to be holding
## ConstantVariance
## Full Does not seem to be holding
## Full-Transformed seem to be holding
## Step-Wise not clearly holding
## Step-Wise-Transformed seem to be holding
## Lasso Does not seem to be clearly holding
## Lasso-Transformed seem to be holding
## ENet Does not seem to be clearly holding
## Independence Normality
## Full Errors are uncorrelated Does not seem to be holding
## Full-Transformed Errors are uncorrelated Improved and moderaetly holding
## Step-Wise Errors are uncorrelated Does not seemd to be holding
## Step-Wise-Transformed Errors are uncorrelated Improved and moderaetly holding
## Lasso Errors are uncorrelated Does not seem to be holding
## Lasso-Transformed Errors are uncorrelated Improved and moderaetly holding
## ENet Errors are uncorrelated Does not seem to be holding
# Persist the three report tables to CSV (row names -- the model labels --
# become the first column of each file).
write.csv(test_predictions_results_comparison,"output_test_predictions_results_comparison.csv")
write.csv(variable_selection,"output_variable_selection.csv")
write.csv(goodness_of_fit,"output_goodness_of_fit.csv")
# Load the next-period predictor data and apply the same feature
# engineering that was applied to the training set.
future_data <- read.csv("2nd_months_predicting_variables.csv")
# future_data=future_data[-c(1,13)]
# Collapse Country into a two-level factor: "United Kingdom" vs "Others"
# (matches the encoding used at training time).
future_data$Country2[future_data$Country != "United Kingdom"] <- "Others"
future_data$Country2[future_data$Country == "United Kingdom"] <- "United Kingdom"
future_data$Country <- as.factor(future_data$Country2)
future_data$Is_Buying_Most_Popular <- as.factor(future_data$Is_Buying_Most_Popular)
# (Removed a redundant second as.factor(future_data$Country) call that was
# here -- Country is already a factor after the assignment above.)
# RFM segment = sum of the three quantile scores, treated as a factor.
future_data$RFMSeg <- as.factor(as.integer(
  future_data$Recency_Quantile + future_data$Frequency_Quantile +
    future_data$Monetory_Value_Quantile
))
####### Using the model built to score on future data
# Predictions are squared to back-transform -- presumably the model was
# fit on sqrt(Y_Income); confirm with the model-fitting section.
future_data_predicted = predict(full.model.transformed, newdata = future_data, type = "response")**2
# Total predicted revenue across all scored customers
revenue=sum(future_data_predicted)
# Assumed gross margin of 1%
gross=1/100
###churn=15.00/100 ###Assuming 15% Churn
###Assuming no Churn
# Net CLV over 6 months = revenue x gross margin (no churn adjustment)
NET_CLV_6months=revenue*gross
print(revenue)
## [1] 28030947
print(NET_CLV_6months)
## [1] 280309.5
# Bin the predicted revenues (in $ thousands) into $1k-wide buckets from
# $0k to $15k, then plot the number of customers per bucket.
pred_in_thousands <- as.numeric(future_data_predicted) / 1000
Groups <- cut(x = pred_in_thousands, breaks = seq(from = 0, to = 15, by = 1))
# Customers per bucket
Bygroup = tapply(future_data_predicted, Groups, length)
####Barplot
b = barplot(height = Bygroup, xlab = "Predicted $ Revenue ( in thousands) ",
            ylab = "#Customers")
Bygroup
## (0,1] (1,2] (2,3] (3,4] (4,5] (5,6] (6,7] (7,8] (8,9] (9,10]
## 2262 878 218 68 34 26 16 9 6 7
## (10,11] (11,12] (12,13] (13,14] (14,15]
## 8 5 3 3 3
Bygroup/sum(Bygroup)
## (0,1] (1,2] (2,3] (3,4] (4,5] (5,6]
## 0.6379018613 0.2476029329 0.0614777214 0.0191765369 0.0095882685 0.0073322053
## (6,7] (7,8] (8,9] (9,10] (10,11] (11,12]
## 0.0045121263 0.0025380711 0.0016920474 0.0019740553 0.0022560632 0.0014100395
## (12,13] (13,14] (14,15]
## 0.0008460237 0.0008460237 0.0008460237